# packages
library(tidyverse)
library(here)
library(lubridate)
library(stopwords)
library(tidytext)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(ggwordcloud)

# Load the raw tweet export (3049 obs.)
tweets <- read_csv(here::here("data", "revised_data.csv"))


# Lower-case the text, drop the Thanksgiving / turkey-sandwich tweets and keep
# only the columns used further down (2640 obs.)
tweets <- tweets |>
  mutate(tweet = tolower(tweet)) |>
  filter(
    !str_detect(tweet, "thanksgiving"),
    !str_detect(tweet, "sandwiches")
  ) |>
  select(
    date, username, name, tweet, mentions,
    replies_count, retweets_count, likes_count, hashtags
  )

# Parse the date column as year-month-day in UTC. The other arguments of the
# earlier call (quiet, locale, truncated) were lubridate's defaults.
tweets <- tweets |>
  mutate(date = ymd(date, tz = "UTC"))

# Keep only the first occurrence of each tweet text (2476 obs.)
tweets <- tweets[!duplicated(tweets$tweet), ]

# Remove retweet markers, mentions, links, punctuation and digits.
# Order matters: mentions and links are recognised by "@" and "://", so they
# must be stripped before the punctuation pass deletes those characters.
# The text has already been lower-cased, hence the lower-case "rt"; the
# leading \\b keeps words that merely end in "rt"/"via" (e.g. "support") intact.
tweets$tweet <- gsub("\\b(rt|via)((?:\\b\\W*@\\w+)+)", " ", tweets$tweet)
tweets$tweet <- gsub("@\\w+", "", tweets$tweet)
# \\S+ consumes the whole link, including "://", "." and "/"
tweets$tweet <- gsub("http\\S+", "", tweets$tweet)
tweets$tweet <- gsub("[[:punct:]]", "", tweets$tweet)
tweets$tweet <- gsub("[[:digit:]]", "", tweets$tweet)

# Derive the calendar year (as text) and bucket it into the year groups used
# for the per-period summaries. Years outside 2009-2021 get NA.
tweets <- tweets |>
  mutate(
    year = format(date, "%Y"),
    year1 = case_when(
      year %in% 2019:2021 ~ "2021-2019",
      year %in% 2016:2018 ~ "2016-2018",
      year %in% 2013:2015 ~ "2013-2015",
      year %in% 2009:2012 ~ "2009-2012"
    )
  )
# sentiment function (defines sentimentfun)
source("~/Desktop/projects/Twitter_usa/sentiment_function.R")

# getting the opinion lexicons from the working directory
tweettext <- tweets$tweet
pos <- readLines("positive_words.txt")
neg <- readLines("negative_words.txt")

# extend the negative lexicon with two extra terms and show the last entries
neg2 <- c(neg, "bearish", "fraud")
tail(neg2)

## apply the sentiment scoring function
library(plyr) # the sentiment function needs the plyr package
# Score against the extended lexicon (neg2); passing `neg` here would silently
# ignore the extra terms added above.
scores <- sentimentfun(tweettext, pos, neg2, .progress = "text")
tweets <- cbind(tweets, scores)
detach("package:plyr", unload = TRUE) # unload plyr, which masks dplyr and here functions
# write the scored tweets to disk for the Tableau analysis
# NOTE(review): write_csv2() writes semicolon-delimited text, so this file is
# not an Excel workbook despite the .xlsx name; path kept unchanged in case
# another tool already reads it -- confirm and rename to .csv if possible.
write_csv2(tweets, here::here("data", "tweets.xlsx"))

# date and tweet numbers
## first change the format to a plain Date
tweets$date <- as.Date(as.POSIXct(tweets$date, tz = "UTC"))
class(tweets$date)
## plotting
# One row per date: with several rows per date, geom_area() (which stacks by
# default) would pile the duplicated rows on top of each other and inflate the
# plotted height.
tweets %>%
  dplyr::count(date, name = "freq") %>%
  ggplot() +
  aes(x = date, y = freq) +
  geom_area(color = "steelblue") +
  # tick format matches the "MM-YYYY" announced in the axis title
  scale_x_date(date_labels = "%m-%Y", date_breaks = "4 months") +
  xlab("Date- MM-YYYY") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Tweets per year as a connected line with the count printed at each point.
# One row per year avoids drawing every point and label once per tweet.
tweets %>%
  dplyr::count(year, name = "freq") %>%
  ggplot() +
  # `year` is a character column, i.e. a discrete x; without group = 1 every
  # year forms its own group and geom_line() has nothing to connect.
  aes(x = year, y = freq, group = 1) +
  geom_line(color = "steelblue") +
  geom_point() +
  scale_y_continuous("count") +
  xlab("year") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_text(aes(label = freq), hjust = 0, vjust = 0)



## number of tweets per date, busiest dates first ##
tweets |>
  dplyr::count(date, name = "freq") |>
  arrange(desc(freq))

# Pattern for leftovers to strip from tweets: links, HTML entities and the
# stand-alone retweet marker. Word boundaries must be written "\\b"; a single
# "\b" inside an R string is a backspace character, not a regex boundary.
replace_reg <- "https?://[^\\s]+|&amp;|&lt;|&gt;|\\bRT\\b"
# Domain words that carry no information for this data set.
custom_stop_words <- tibble(
  word = c(
    "amp", "president", "turkey", "turkish", "im", "house",
    "vp", "american", "press", "conference"
  )
)
# check out the most used words per account on 16-17 October 2019
tweets %>%
  group_by(username) %>%
  filter(date == "2019-10-16" | date == "2019-10-17") %>%
  mutate(text = str_replace_all(tweet, replace_reg, "")) %>%
  # tokenise the cleaned `text` column; tokenising `tweet` would discard the
  # replacement done on the previous line
  unnest_tokens(word, text, token = "tweets") %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(custom_stop_words, by = "word") %>%
  # still grouped by username, so this counts each word per account
  count(word, sort = TRUE) %>%
  ungroup() %>%
  filter(n > 2) %>%
  ggplot(aes(x = word, y = n, fill = username)) +
  geom_col(color = "gray40", lty = 2) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() +
  labs(x = "words", y = "Frequency", title = "Most Frequent words")

# Words that appear in the bing sentiment lexicon, with their counts; the
# 200 most frequent feed the word cloud below.
wordcloud_df <- tweets |>
  select(text) |>
  unnest_tokens(word, text, token = "tweets") |>
  anti_join(custom_stop_words) |>
  inner_join(get_sentiments("bing")) |>
  count(sentiment, word, sort = TRUE) |>
  top_n(200)

# Word cloud with label size scaled by count.
ggplot(wordcloud_df) +
  geom_text_wordcloud_area(aes(label = word, size = n)) +
  scale_size_area(max_size = 15)

# Count of every (sentiment, word) pair found in the bing lexicon.
bing_word_counts <- tweets |>
  select(text) |>
  unnest_tokens(word, text, token = "tweets") |>
  anti_join(custom_stop_words) |>
  inner_join(get_sentiments("bing")) |>
  count(sentiment, word, sort = TRUE) |>
  ungroup()

# Bar chart of rows per sentiment.
# NOTE(review): this counts the number of distinct words per sentiment, not the
# number of occurrences (that would be sum(n)) -- confirm which one is wanted.
bing_word_counts |>
  count(sentiment, name = "freq") |>
  ggplot() +
  geom_col(mapping = aes(x = sentiment, y = freq)) +
  theme_minimal() +
  labs(x = "Sentiment", y = "Frequency", title = "Sentiment in tweets")

# The 20 accounts with the most tweets, as a horizontal bar chart
tweets |>
  dplyr::count(username, sort = TRUE) |>
  dplyr::mutate(username = reorder(username, n)) |>
  top_n(20) |>
  ggplot(mapping = aes(x = username, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Senators",
    y = "Tweet Counts",
    title = "Which Senators Tweets more about Turkey"
  )
# Tweet counts per year group for the 20 largest (account, year group) pairs
tweets %>%
  group_by(username, year1) %>%
  summarise(freq = n()) %>%
  ungroup() %>%
  arrange(desc(freq)) %>%
  top_n(20) %>%
  # map fill to the account so the "Senators" legend declared in labs() exists
  # and the stacked bars show who contributed the tweets
  ggplot(aes(x = year1, y = freq, fill = username)) +
  geom_col() +
  theme_minimal() +
  coord_flip() +
  labs(x = "Years",
       y = "Tweet Counts",
       title = "In Which years, Senators Tweeted most about Turkey",
       fill = "Senators")

## text analysis ##
# One row per token: strip the unwanted patterns, tokenise the cleaned text
# and drop the standard stop words.
words <- tweets |>
  mutate(text = str_replace_all(tweet, replace_reg, "")) |>
  unnest_tokens(word, text, token = "tweets") |>
  anti_join(stop_words)

# plot the top 20 words
words %>%
  count(word, sort = TRUE) %>%
  filter(word != "amp") %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # Axis titles follow the aesthetics, not the flipped display: x is the word
  # and y is its count.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
  
  

# split into word pairs
# One alternation of all English stop words, each wrapped in word boundaries,
# then a stop-word-free copy of the tweet text.
stopwords_regex <- paste0(
  "\\b",
  paste(stopwords("en"), collapse = "\\b|\\b"),
  "\\b"
)
tweet_txt <- stringr::str_replace_all(tweettext, stopwords_regex, "")

# Bigrams built from the tweet text after removing the unwanted patterns and
# the stop words.
bigrams <- tweets |>
  mutate(
    text = str_replace_all(tweet, replace_reg, ""),
    text1 = str_replace_all(text, stopwords_regex, "")
  ) |>
  unnest_tokens(bigram, text1, token = "ngrams", n = 2)
# most used bigrams
bigrams %>%
  count(bigram, sort = TRUE) %>%
  top_n(20) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(x = bigram, y = n)) +
  geom_col() +
  coord_flip() +
  # Axis titles follow the aesthetics, not the flipped display: x is the
  # bigram and y is its count.
  labs(x = "bigrams",
       y = "Count",
       title = "Count of bigrams in tweets")

# Split each bigram into its two words (keeping the bigram column) and keep
# only pairs where both parts contain a lower-case letter.
bigrams <- bigrams |>
  separate(bigram, into = c("first", "second"), sep = " ", remove = FALSE) |>
  filter(
    str_detect(first, "[a-z]"),
    str_detect(second, "[a-z]")
  )

# Word counts per account, then the words of one account, most frequent first
words_count <- words |>
  group_by(username, word) |>
  count()

grahams_words <- words_count |>
  filter(username == "lindseygrahamsc") |>
  arrange(desc(n))


# The 25 most frequent words of the lindseygrahamsc account
ggplot(head(grahams_words, 25)) +
  aes(x = reorder(word, n), y = n) +
  geom_col() +
  coord_flip() +
  # Axis titles follow the aesthetics, not the flipped display: x is the word
  # and y is its count.
  labs(x = "graham_words",
       y = "Count",
       title = "Count of words in graham's tweets")

# The 20 most frequent bigrams of the lindseygrahamsc account
graham_bigram <- bigrams %>%
  filter(username == "lindseygrahamsc") %>%
  count(bigram, sort = TRUE) %>%
  top_n(20) %>%
  mutate(bigram = reorder(bigram, n))

ggplot(graham_bigram, aes(x = bigram, y = n)) +
  geom_col() +
  coord_flip() +
  # Axis titles follow the aesthetics, not the flipped display: x is the
  # bigram and y is its count.
  labs(x = "bigrams",
       y = "Count",
       title = "Count of Graham's bigrams in his tweets")


## Sentiment ##
# lexicon source: https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
bing <- get_sentiments("bing")
View(bing)

# Inspect the 40 most frequent words to decide which ones to treat as custom
# stop words.
words |>
  count(word, sort = TRUE) |>
  filter(word != "amp") |>
  top_n(40)
# Custom stop words chosen from that list; replaces the earlier, longer list.
custom_stop_words <- tibble(
  word = c("president", "turkey", "turkish", "im", "house")
)

# Attach the bing sentiment to every token and drop the custom stop words
sentiments <- words |>
  inner_join(bing, by = "word") |>
  anti_join(custom_stop_words, by = "word")

# Number of positive / negative tokens per account, largest first
sentiments_counts <- sentiments |>
  group_by(username) |>
  count(sentiment) |>
  arrange(desc(n))

# Share (in percent, two decimals) of negative tokens per account
negative_freqs <- sentiments_counts |>
  left_join(
    sentiments_counts |>
      group_by(username) |>
      summarise(total = sum(n))
  ) |>
  mutate(percent = round(n / total * 100, 2)) |>
  filter(sentiment == "negative")

# sentiment per year group: stacked bars per account, one facet per year group
sentiments %>%
  group_by(username, year1, sentiment) %>%
  summarise(freq = n()) %>%
  ggplot() +
  geom_col(mapping = aes(x = username, y = freq, fill = sentiment)) +
  facet_wrap(~year1) +
  # x carries the accounts and y the counts, so the titles go that way round
  labs(x = "Senators",
       y = "Frequency",
       fill = "Sentiments",
       title = "Sentiments per year groups")


# Topic Modeling #
# Build a tm corpus from the tweet text, remove English stop words, stem, and
# finally remove the domain-specific words.
tweet_corpus <- tweets$tweet
corpus <- VectorSource(tweet_corpus) |>
  Corpus() |>
  tm_map(removeWords, stopwords("en")) |>
  tm_map(stemDocument) |>
  tm_map(
    removeWords,
    c("amp", "presid", "president", "turkey", "turkish", "im", "house")
  )

# word cloud per year group
clean_tweet <- read_csv(here("data", "cleantweetusa.csv"))
# colour palette for the word clouds
pal <- brewer.pal(8, "Dark2")

# Build the corpus for one set of tweet texts: remove English stop words,
# stem, then remove the domain-specific words.
build_cloud_corpus <- function(texts) {
  Corpus(VectorSource(texts)) |>
    tm_map(removeWords, stopwords("en")) |>
    tm_map(stemDocument) |>
    tm_map(
      removeWords,
      c("amp", "presid", "president", "turkey", "turkish", "im", "house")
    )
}

# Draw one word cloud with the settings shared by all year groups.
draw_cloud <- function(corpus) {
  wordcloud(
    corpus,
    min.freq = 2, max.words = 300, random.order = TRUE,
    colors = pal, scale = c(3, 0.1)
  )
}

tweet_9_12 <- clean_tweet %>%
  filter(year1 == "2009-2012")
corpus1 <- build_cloud_corpus(tweet_9_12$tweet)
wd1 <- draw_cloud(corpus1)

tweet_13_15 <- clean_tweet %>%
  filter(year1 == "2013-2015")
corpus2 <- build_cloud_corpus(tweet_13_15$tweet)
wd2 <- draw_cloud(corpus2)

tweet_16_18 <- clean_tweet %>%
  filter(year1 == "2016-2018")
corpus3 <- build_cloud_corpus(tweet_16_18$tweet)
wd3 <- draw_cloud(corpus3)

# the most recent group is labelled "2021-2019" in the data
tweet_19_21 <- clean_tweet %>%
  filter(year1 == "2021-2019")
corpus4 <- build_cloud_corpus(tweet_19_21$tweet)
wd4 <- draw_cloud(corpus4)

library(gridExtra)
# All four clouds on one 2 x 2 page. wordcloud() draws on the base graphics
# device instead of returning a grob, so grid.arrange() cannot lay the clouds
# out; a base-graphics panel layout is used instead. The fourth panel shows
# corpus4 (it previously repeated corpus3).
old_par <- par(mfrow = c(2, 2))
for (period_corpus in list(corpus1, corpus2, corpus3, corpus4)) {
  draw_cloud(period_corpus)
}
par(old_par)

library(knitr)
# Table of the tweets dated strictly between 2016-06-01 and 2016-09-01,
# newest first
tweets |>
  filter(date > "2016-06-01", date < "2016-09-01") |>
  select(date, tweet) |>
  arrange(desc(date)) |>
  kable()

# Number of tweets per year group, largest first
tweets |>
  dplyr::count(year1, name = "freq") |>
  arrange(desc(freq)) |>
  top_n(20)

# Row count for every (account, replies, tweet text, year) combination
tweets |>
  group_by(username, replies_count, tweet, year) |>
  summarise(freq = n()) |>
  ungroup()

# Text and date of every tweet from the sensanders account
tweets |>
  filter(username == "sensanders") |>
  select(tweet, date)


# Number of tweets per party, bars ordered by count.
# NOTE(review): `party` is not among the columns kept by the select() near the
# top of this script -- confirm the column exists in `tweets` at this point.
tweets |>
  group_by(party) |>
  summarise(freq = n()) |>
  ggplot(aes(x = reorder(party, freq), y = freq)) +
  geom_col() +
  geom_text(aes(label = freq), hjust = 0, vjust = -0.5) +
  # x carries the party and y the count, so the titles go that way round
  xlab("party affiliation") +
  ylab("count")


# State polygons for mapping, drawn as gray shapes with black outlines
MainStates <- map_data("state")
ggplot(MainStates, aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "gray")

# second copy of the state polygons under another name
usa_states <- map_data("state")

